I. Initialization

Import necessary libraries

# Libraries
library(tidyverse)
library(MASS)
library(caret)
library(Amelia)
library(mice)
library(UBL)
library(bestNormalize)
library(VIM)
library(gridExtra)
library(latex2exp)

Load IMPACT dataframe

# Load function to fix IMPACT variable types in the dataframe
source('functions/fix_impact_dataframe.R')

# Load cleaned sample set labels for stratified cross-validation sampling
impact.dataframe <- read.csv('../impact_dataframe.csv') %>% fix.impact.dataframe()

# Convert pupillary reactivity variable type to integer for imputation
impact.dataframe$unreactive_pupils <- as.integer(impact.dataframe$unreactive_pupils) - 1

# Produce table of summary information of Box-Cox transformed dataset
knitr::kable(summary(impact.dataframe))
entity_id PatientType age unreactive_pupils GCS GCSm Hb glu GOSE hypoxia hypotension marshall tsah EDH
1aEp804: 1 1: 668 Min. :18.00 Min. :0.0000 Min. : 3.00 Min. :1.000 Min. : 4.50 Min. : 1.890 1: 467 0:3314 0:3310 Min. :1.000 0 :1568 0 :2660
1bgj055: 1 2:1180 1st Qu.:35.00 1st Qu.:0.0000 1st Qu.: 9.00 1st Qu.:5.000 1st Qu.:12.30 1st Qu.: 6.000 3: 319 1: 259 1: 263 1st Qu.:1.000 1 :1418 1 : 328
1BkM368: 1 3:1725 Median :53.00 Median :0.0000 Median :15.00 Median :6.000 Median :13.60 Median : 7.100 4: 154 NA NA Median :2.000 NA’s: 587 NA’s: 585
1bLs736: 1 NA Mean :51.52 Mean :0.1835 Mean :11.91 Mean :4.903 Mean :13.27 Mean : 7.809 5: 342 NA NA Mean :2.396 NA NA
1CLZ132: 1 NA 3rd Qu.:67.00 3rd Qu.:0.0000 3rd Qu.:15.00 3rd Qu.:6.000 3rd Qu.:14.70 3rd Qu.: 8.660 6: 379 NA NA 3rd Qu.:2.000 NA NA
1eUQ847: 1 NA Max. :96.00 Max. :2.0000 Max. :15.00 Max. :6.000 Max. :21.30 Max. :55.500 7: 623 NA NA Max. :6.000 NA NA
(Other):3567 NA NA NA’s :206 NA’s :128 NA’s :75 NA’s :1108 NA’s :1374 8:1289 NA NA NA’s :591 NA NA

Load testing folds for repeated cross-validation splits

testing.folds <- readRDS('../testing_folds.rds')

I. Multiple imputation and evaluation of assumptions

Perform multiple imputation with chained equations

# Remove entity ID and GOSE as predictors for MICE
pred.matrix <- make.predictorMatrix(impact.dataframe)
pred.matrix['GOSE',] <- 0
pred.matrix[,'GOSE'] <- 0
pred.matrix['entity_id',] <- 0
pred.matrix[,'entity_id'] <- 0

# Run model through MICE multiple imputation package
mi.impact <-
  mice(
    impact.dataframe,
    predictorMatrix = pred.matrix,
    m = length(testing.folds) * length(testing.folds[[1]]),
    maxit = 30,
    seed = 2020,
    method = 'pmm',
    printFlag = TRUE
  )

# Save mice object
saveRDS(mi.impact,'../mice_impact.rds')

Observe diagnostic information of the imputation

# Load mice object
mi.impact <- readRDS('../mice_impact.rds')

# View mean and standard deviation iteration plots for MICE
plot(mi.impact)

# Compare distributions of imputed variables against their true distributions
stripplot(mi.impact, Hb~.imp)

stripplot(mi.impact, glu~.imp)

# Observe imputation densities
densityplot(mi.impact)

bwplot(mi.impact)

Save imputed dataframes into proper directories

# Names of repeats
repeat.names <- names(testing.folds)

# Names of folds
fold.names <- names(testing.folds[[1]])

# Create grid dataframe of all identifying names of folds
repeat.fold.names <- expand.grid(repeat.names,fold.names)
names(repeat.fold.names) <- c('repeat.name','fold.name')

# Create directory to store repeated-CV dataframes
dir.create('../repeated_cv',showWarnings = FALSE)

# Save multiple imputations as csv files into respective directories
for (i in  1:mi.impact$m){
  # Load current imputation
  curr.imp <- complete(mi.impact, action = i)
  
  # Get indexes of current repetition and fold
  curr.test.split <- testing.folds[[repeat.fold.names$repeat.name[i]]][[repeat.fold.names$fold.name[i]]]
  
  # Create directory for current repetition and fold
  dir.create(file.path('../repeated_cv',
                       repeat.fold.names$repeat.name[i],
                       repeat.fold.names$fold.name[i]),
             showWarnings = FALSE,
             recursive = TRUE)
  
  # Save full imputed dataset
  write.csv(curr.imp,
            file = file.path('../repeated_cv',
                             repeat.fold.names$repeat.name[i],
                             repeat.fold.names$fold.name[i],
                             'full_imputed_dataframe.csv'),
            row.names = FALSE)
  
  # Save imputed training dataset
  write.csv(curr.imp[-curr.test.split,],
            file = file.path('../repeated_cv',
                             repeat.fold.names$repeat.name[i],
                             repeat.fold.names$fold.name[i],
                             'train_dataframe.csv'),
            row.names = FALSE)
  
  # Save imputed testing dataset
  write.csv(curr.imp[curr.test.split,],
            file = file.path('../repeated_cv',
                             repeat.fold.names$repeat.name[i],
                             repeat.fold.names$fold.name[i],
                             'test_dataframe.csv'),
            row.names = FALSE)
  
  print(paste('Imputation no.',i,'complete'))
}

III. Box-Cox transform to normalize and scale dataset

Perform Box-Cox transformation to normalize training and testing datasets and then scale data

# Load function to fix IMPACT variable types in the dataframe
source('functions/fix_impact_dataframe.R')

# Load testing folds for repeated cross-validation
testing.folds <- readRDS('../testing_folds.rds')

# Identify columns to undergo Box-Cox normalization
bc.columns <- c('age','GCSm','Hb','glu','marshall')

# Initialize empty lists to store Box-Cox models
bc <- vector(mode = 'list')

# Loop through repeated-CV folds, train Box-Cox models, and transform predictor datasets
for (repeat.names in names(testing.folds)){
  for (fold.names in names(testing.folds[[repeat.names]])){
    # Load current imputed training set
    curr.training.set <- read.csv(file.path('../repeated_cv',repeat.names,fold.names,'train_dataframe.csv'))

    # Load current imputed testing set
    curr.testing.set <- read.csv(file.path('../repeated_cv',repeat.names,fold.names,'test_dataframe.csv'))
    
    for (curr.col in bc.columns){
      # Train boxcox model on current training set
      curr.bc <- boxcox(curr.training.set[,curr.col],
                        standardize = TRUE)
      # Replace training set values
      curr.training.set[,curr.col] <- curr.bc$x.t
      # Replace testing set values
      curr.testing.set[,curr.col] <- predict(curr.bc, newdata = curr.testing.set[,curr.col])
      # Store current box.cox object in compiling list
      bc[[repeat.names]][[fold.names]][[curr.col]] <- curr.bc
    }
    
    # Store box-coxed dataframes into respective fold directories
    write.csv(curr.training.set,file.path('../repeated_cv',repeat.names,fold.names,'norm_train_dataframe.csv'),row.names = FALSE)
    write.csv(curr.testing.set,file.path('../repeated_cv',repeat.names,fold.names,'norm_test_dataframe.csv'), row.names = FALSE)
    print(paste(repeat.names,fold.names,'complete.'))
  }
}

# Save Box-Cox transformation object list
saveRDS(bc,'../repeated_cv/box_cox.rds')

IV. Synthetic Minority Oversampling Technique

Perform Synthetic Minority Oversampling Technique (SMOTE) to rectify class imbalance as a hyperparameter during training

# Load function to fix variable types of pre-SMOTE'd dataset
source('functions/fix_smote_impact_dataframe.R')

# Load testing folds for repeated cross-validation
testing.folds <- readRDS('../testing_folds.rds')

# Loop through repeated-CV folds and apply SMOTE function to balance GOSE distributions in normalized training sets
for (repeat.names in names(testing.folds)){
  for (fold.names in names(testing.folds[[repeat.names]])){
    # Load current normalized imputed training set and remove non-IMPACT predictors
    curr.norm.training.set <- read.csv(file.path('../repeated_cv',repeat.names,fold.names,'norm_train_dataframe.csv')) %>%
      fix.smote.impact.dataframe() %>%
      dplyr::select(-c(entity_id,PatientType,GCS))
    
  # Print original GOSE class distribution
  print('Original GOSE class distribution in training set:')
  print(table(curr.norm.training.set$GOSE))
  
  # Apply SMOTE on training data with heterogeneous value difference metric
  curr.smote.norm.training.set <- SmoteClassif(GOSE ~ age + unreactive_pupils + GCSm + Hb + glu + hypoxia + hypotension + marshall + tsah + EDH,curr.norm.training.set,dist = "HVDM")
  
  # Print SMOTE'd GOSE class distribution
  print('SMOTEd GOSE class distribution in training set:')
  print(table(curr.smote.norm.training.set$GOSE))
  
  # Save SMOTE'd training data
  write.csv(curr.smote.norm.training.set,
            file.path('../repeated_cv',repeat.names,fold.names,'smote_norm_train_dataframe.csv'),
            row.names = FALSE)
  }
}